# Argument Structure Analysis by Transformer model

In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot  as plt
import torch
import torch.nn as nn
import torch.optim as optimizers
import torch.nn.functional as F 
import random
from scipy import sparse
from scipy.stats import norm

np.random.seed(9837)
torch.manual_seed(9837)
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 50)

In [2]:
# 切断ポアソン分布を生成する関数
def rtpois(mu, a, b, n):
    FA = scipy.stats.poisson.cdf(a, mu)
    FB = scipy.stats.poisson.cdf(b, mu)
    return np.array(scipy.stats.poisson.ppf(np.random.uniform(0, 1, n)*(FB-FA)+FA, mu), dtype="int")

# 多項分布の乱数を生成する関数
def rmnom(pr, n, k, pattern):
    if pattern==1:
        z_id = np.array(np.argmax(np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis], axis=1), dtype="int")
        Z = np.diag(np.repeat(1, k))[z_id, ]
        return z_id, Z
    z_id = np.array(np.argmax((np.cumsum(pr, axis=1) >= np.random.uniform(0, 1, n)[:, np.newaxis]), axis=1), dtype="int")
    return z_id

# rel関数を定義
def rel(x):
    x[x < 0] = 0.0
    return x

# データの前処理

## データの読み込みと分割

In [3]:
# データの読み込み
path = "D:/Statistics/data/NLP/"
kyoto_corpus = pd.read_csv(path + "new_kyoto_corpus.csv")
kyoto_dependency = pd.read_csv(path + "new_kyoto_dependency_feature.csv")
kwdlc_corpus = pd.read_csv(path + "new_kwdlc_corpus.csv")
kwdlc_dependency = pd.read_csv(path + "new_kwdlc_dependency_feature.csv")
rel_type_dependency = pd.read_csv(path + "rel_type_dependency.csv")
D1 = np.unique(kyoto_corpus["d_id"]).shape[0]
D2 = np.unique(kwdlc_corpus["d_id"]).shape[0]

In [4]:
# データの分割
split = 0.9
split1 = np.split(np.arange(D1), [int(D1 * split)])
split2 = np.split(np.arange(D2), [int(D2 * split)])
kyoto_corpus1 = kyoto_corpus.iloc[np.where(np.in1d(np.array(kyoto_corpus["d_id"]), split1[0]))[0]]
kyoto_corpus2 = kyoto_corpus.iloc[np.where(np.in1d(np.array(kyoto_corpus["d_id"]), split1[1]))[0]]
kyoto_dependency1 = kyoto_dependency.iloc[np.where(np.in1d(np.array(kyoto_dependency["d_id"]), split1[0]))[0]]
kyoto_dependency2 = kyoto_dependency.iloc[np.where(np.in1d(np.array(kyoto_dependency["d_id"]), split1[1]))[0]]
kwdlc_corpus1 = kwdlc_corpus.iloc[np.where(np.in1d(np.array(kwdlc_corpus["d_id"]), split2[0]))[0]]
kwdlc_corpus2 = kwdlc_corpus.iloc[np.where(np.in1d(np.array(kwdlc_corpus["d_id"]), split2[1]))[0]]
kwdlc_dependency1 = kwdlc_dependency.iloc[np.where(np.in1d(np.array(kwdlc_dependency["d_id"]), split2[0]))[0]]
kwdlc_dependency2 = kwdlc_dependency.iloc[np.where(np.in1d(np.array(kwdlc_dependency["d_id"]), split2[1]))[0]]
kyoto_corpus2.index = np.arange(kyoto_corpus2.shape[0])
kyoto_dependency2.index = np.arange(kyoto_dependency2.shape[0])
kwdlc_corpus2.index = np.arange(kwdlc_corpus2.shape[0])
kwdlc_dependency2.index = np.arange(kwdlc_dependency2.shape[0])

# 学習データの前処理

## データの定義

In [5]:
# idを定義
# 文章idを定義
d_id1 = np.array(kyoto_corpus1["d_id"].iloc[np.where(kyoto_corpus1[["d_id", "phrase_no"]].duplicated()==False)[0]], dtype="int")
d_id2 = np.array(kwdlc_corpus1["d_id"].iloc[np.where(kwdlc_corpus1[["d_id", "phrase_no"]].duplicated()==False)[0]], dtype="int")
d_id = np.append(d_id1, d_id2 + np.max(d_id1) + 1)
d_long1 = np.array(kyoto_corpus1["d_id"], dtype="int")
d_long2 = np.array(kwdlc_corpus1["d_id"], dtype="int")
d_long = np.append(d_long1, d_long2 + np.max(d_long1) + 1)

# phrase idを定義
phrase_id1 = np.array(kyoto_corpus1["phrase_no"], dtype="int")
phrase_id2 = np.array(kwdlc_corpus1["phrase_no"], dtype="int")
phrase_id = np.append(phrase_id1, phrase_id2 + np.max(phrase_id1) + 1)

In [6]:
# データの定義
# 統計量を定義
D = len(np.unique(d_id))
d = np.unique(d_id, return_counts=True)[1].astype("int")
phrase = len(np.unique(phrase_id))
n = np.unique(phrase_id, return_counts=True)[1].astype("int")
N = np.sum(n)
max_m = np.max(np.append(np.max(kyoto_corpus["d_id"].value_counts()), np.max(kwdlc_corpus["d_id"].value_counts())))
max_n = np.max(np.append(np.max(kyoto_corpus["phrase_no"].value_counts()), np.max(kwdlc_corpus["phrase_no"].value_counts())))

# インデックスを定義
d_list1 = [i for i in range(D)]
d_list2 = [i for i in range(D)]
phrase_list = [i for i in range(phrase)]
pt = np.repeat(0, D)
for i in range(D):
    d_list1[i] = np.where(d_id==i)[0].astype("int")
    d_list2[i] = np.where(d_long==i)[0].astype("int")
    pt[i] = d_list2[i].shape[0]
    
for i in range(phrase):
    if i==0:
        max_no = 0
        phrase_list[i] = np.arange(n[i])
        max_no = np.max(phrase_list[i]) + 1
    else:
        phrase_list[i] = max_no + np.arange(n[i])
        max_no = np.max(phrase_list[i]) + 1

In [7]:
# 文書のフレーズを定義
# フレーズ間の組み合わせを定義
kyoto_max = np.max(np.array(kyoto_dependency1[["phrase_no1", "phrase_no2"]]))
kyoto_phrase_id = np.array(kyoto_dependency1["d_id"], dtype="int")
kwdlc_phrase_id = np.array(kwdlc_dependency1["d_id"], dtype="int")
kyoto_phrase_no1 = np.array(kyoto_dependency1["phrase_no1"], dtype="int")
kyoto_phrase_no2 = np.array(kyoto_dependency1["phrase_no2"], dtype="int")
kwdlc_phrase_no1 = np.array(kwdlc_dependency1["phrase_no1"], dtype="int")
kwdlc_phrase_no2 = np.array(kwdlc_dependency1["phrase_no2"], dtype="int")
feature_phrase1 = np.append(kyoto_phrase_no1, kwdlc_phrase_no1 + kyoto_max + 1)
feature_phrase2 = np.append(kyoto_phrase_no2, kwdlc_phrase_no2 + kyoto_max + 1)
feature_id = np.append(kyoto_phrase_id, kwdlc_phrase_id + np.max(kyoto_phrase_id) + 1)
feature_phrase = np.hstack((feature_phrase1[:, np.newaxis], feature_phrase2[:, np.newaxis]))
feature_list = [np.where(feature_id==i)[0].astype("int") for i in range(D)]
F1 = feature_phrase.shape[0]
F2 = feature_phrase.shape[1]

# フレーズ間距離を定義
C = 2
distance = np.array(feature_phrase[:, 1] - feature_phrase[:, 0] <= C, dtype="int")
distance_index = [np.where(distance==0)[0].astype("int"), np.where(distance==1)[0].astype("int")]

# フレーズがあるレコードを抽出
phrase_flag = []
for i in range(D):
    flag = np.repeat(0, max_m)
    flag[np.arange(d[i])] = 1
    phrase_flag.append(flag)
phrase_flag = np.hstack((phrase_flag))
phrase_index = np.where(phrase_flag==1)[0].astype("int")

## Positional Encodingを定義

In [8]:
# 単語単位のpositional encodingをマッピング
# 単語の位置を定義
splits = 30
start = 0.0; end = 0.999
mapping_target1 = np.hstack(([np.arange(n[i]) for i in range(phrase)])) 
allocation1 = np.unique(np.quantile(mapping_target1, q=np.linspace(start, end, splits)).astype("int"))
max_pt1 = len(allocation1)

# 位置idをマッピング
pt_list1 = [j for j in range(max_pt1)]
pt_id1 = np.repeat(0, N)
for j in range(max_pt1):
    if (max_pt1-1) > j:
        pt_list1[j] = np.where((mapping_target1 >= allocation1[j]) & (mapping_target1 < allocation1[j+1]))[0].astype("int")
        pt_id1[pt_list1[j]] = np.repeat(j, pt_list1[j].shape[0])
    if (max_pt1-1)==j:
        pt_list1[j] = np.where(mapping_target1 >= allocation1[j])[0].astype("int")
        pt_id1[pt_list1[j]] = np.repeat(j, pt_list1[j].shape[0])
        
# フレーズの末尾の位置を定義
function_flag = np.repeat(0, N)
for i in range(phrase):
    function_flag[np.max(phrase_list[i])] = 1
pt_id1[function_flag==1] = max_pt1
max_pt1 = np.unique(pt_id1).shape[0]

# idとインデックスを定義
pt_list1 = [j for j in range(max_pt1)]
pt_n1 = np.repeat(0, max_pt1)
for j in range(max_pt1):
    pt_list1[j] = np.where(pt_id1==j)[0].astype("int")
    pt_n1[j] = pt_list1[j].shape[0]

In [9]:
# フレーズ単位のpositional encodingをマッピング
# フレーズの位置を定義
splits = 30
start = 0.0; end = 0.999
mapping_target2 = np.hstack(([np.arange(d[i]) for i in range(D)])) 
allocation2 = np.unique(np.quantile(mapping_target2, q=np.linspace(start, end, splits)).astype("int"))
max_pt2 = len(allocation2)

# 位置idをマッピング
pt_list2 = [j for j in range(max_pt2)]
pt_id2 = np.repeat(0, phrase)
pt_n2 = np.repeat(0, max_pt2)
for j in range(max_pt2):
    if (max_pt2-1) > j:
        pt_list2[j] = np.where((mapping_target2 >= allocation2[j]) & (mapping_target2 < allocation2[j+1]))[0].astype("int")
        pt_id2[pt_list2[j]] = np.repeat(j, pt_list2[j].shape[0])
        pt_n2[j] = pt_list2[j].shape[0]
    if (max_pt2-1)==j:
        pt_list2[j] = np.where(mapping_target2 >= allocation2[j])[0].astype("int")
        pt_id2[pt_list2[j]] = np.repeat(j, pt_list2[j].shape[0])
        pt_n2[j] = pt_list2[j].shape[0]

## 入力単語を定義

In [10]:
# 低頻度の単語を品詞で入れ替える
# テキストの正規化
word_class = np.append(kyoto_corpus["word_class"], kwdlc_corpus["word_class"])
class_detail1 = np.append(kyoto_corpus["class_detail1"], kwdlc_corpus["class_detail1"])
class_detail2 = np.append(kyoto_corpus["class_detail2"], kwdlc_corpus["class_detail2"])
class_detail3 = np.append(kyoto_corpus["class_detail3"], kwdlc_corpus["class_detail3"])
new_genkei = np.append(kyoto_corpus["genkei"], kwdlc_corpus["genkei"])
new_genkei[class_detail1=="数"] = "0"
new_genkei = np.array(pd.Series(new_genkei).str.lower().str.normalize("NFKC"))

# 単語頻度を定義
threshold_freq = 25
word_freq = pd.Series(new_genkei).value_counts()
factorized_word = np.array(word_freq.index)[np.where(word_freq < threshold_freq)[0]]
flag = np.repeat(1, len(factorized_word))

# 名寄対象の単語データフレームを定義
info1 = pd.DataFrame({"serial_no": np.arange(len(new_genkei)), "genkei": new_genkei, "class": word_class, "class_detail1": class_detail1,
                      "class_detail2": class_detail2, "class_detail3": class_detail3})
info2 = pd.DataFrame({"genkei": factorized_word, "flag": flag})
factorized_df = pd.merge(info1, info2, on="genkei", how="left")
factorized_df = factorized_df.iloc[np.where(pd.isna(factorized_df["flag"])==False)[0]]
factorized_df.index = np.arange(factorized_df.shape[0])
index_factorized = np.array(factorized_df["serial_no"], dtype="int")

# 単語を品詞に置き換える
index_detail1 = np.where(factorized_df["class_detail1"]!="*")[0].astype("int")
index_detail2 = np.where(factorized_df["class_detail2"]!="*")[0].astype("int")
new_genkei[index_factorized] = np.array(factorized_df["class"])
new_genkei[index_factorized[index_detail1]] = np.array(factorized_df["class_detail1"].iloc[index_detail1])
new_genkei[index_factorized[index_detail2]] = np.array(factorized_df["class_detail1"].iloc[index_detail2])
del info1, info2, factorized_df

In [11]:
# 単語idおよび活用形id定義
# 学習データのレコード
index1 = np.where(np.in1d(np.array(kyoto_corpus["d_id"]), split1[0]))[0].astype("int")
index2 = np.where(np.in1d(np.append(np.repeat(-1, kyoto_corpus.shape[0]), kwdlc_corpus["d_id"]), split2[0]))[0].astype("int")
index = np.append(index1, index2)

# 単語idをマッピング
unique_word = np.unique(new_genkei)
v1 = unique_word.shape[0]
word_df = pd.DataFrame({"word": unique_word, "id": np.arange(v1)})
word_id = np.array(pd.merge(pd.DataFrame({"word": new_genkei[index]}), word_df, on="word", how="left")["id"])

# 活用形idをマッピング
inflection = np.append(kyoto_corpus["inflectional2"], kwdlc_corpus["inflectional2"])
unique_inflection = np.unique(inflection)
v2 = unique_inflection.shape[0]
inflection_df = pd.DataFrame({"inflection": unique_inflection, "id": np.arange(v2)})
inflection_id = np.array(pd.merge(pd.DataFrame({"inflection": inflection[index]}), inflection_df, on="inflection", how="left")["id"])

In [12]:
# idのベクトルを行列に変換
# 単語レベルのベクトルを行列に変換
word_box = np.full((phrase, max_n+1), v1+1)
inflection_box = np.full((phrase, max_n+1), v2+1)
pt_box = np.full((phrase, max_n+1), max_pt1+1)
for i in range(phrase):
    word_box[i, np.arange(n[i]+1)] = np.append(0, word_id[phrase_list[i]]+1)
    inflection_box[i, np.arange(n[i]+1)] = np.append(0, inflection_id[phrase_list[i]]+1)
    pt_box[i, np.arange(n[i]+1)] = np.append(0, pt_id1[phrase_list[i]]+1)
    
# フレーズレベルのベクトルを行列に変換
phrase_box = np.full((D, max_m), phrase)
for i in range(D):
    phrase_box[i, np.arange(d[i])] = d_list1[i]

## 応答変数の定義

In [13]:
# 係り受けの応答変数を定義
# rel typeのmappingを読み込む
rel_mapping = pd.read_csv(path + "/rel_mapping/rel_mapping.csv")
rel_mapping = rel_mapping.iloc[np.where(pd.isna(rel_mapping["mapping"])==False)[0]]
rel_mapping.index = np.arange(rel_mapping.shape[0])
rel_class = np.append(np.unique(rel_mapping["mapping"]), np.array(["係り受け"]))
classes = len(rel_class)

In [14]:
# パターンごとに係り受け関係を取得
columns = ["dependency", "rel", "rel_type"]
dependency = pd.concat((kyoto_dependency1[columns], kwdlc_dependency1[columns]), axis=0)
dependency.index = np.arange(F1)

Y = np.full((F1, classes), 0)
for j in range(classes-1):
    search_word = "^%s$|;%s;|^%s;|;%s$" % (rel_class[j], rel_class[j], rel_class[j], rel_class[j])
    index = np.where(dependency["rel_type"].str.contains(search_word)==True)[0].astype("int")
    Y[index, j] = 1
Y[np.where(dependency["dependency"]==1)[0], classes-1] = 1

# 検証データの前処理

## データの定義

In [15]:
# idを定義
# 文章idを定義
d_id1 = np.array(kyoto_corpus2["d_id"].iloc[np.where(kyoto_corpus2[["d_id", "phrase_no"]].duplicated()==False)[0]], dtype="int")
d_id2 = np.array(kwdlc_corpus2["d_id"].iloc[np.where(kwdlc_corpus2[["d_id", "phrase_no"]].duplicated()==False)[0]], dtype="int")
d_id0 = np.append(d_id1 - np.min(d_id1), d_id2 - np.min(d_id2) + np.max(d_id1 - np.min(d_id1)) + 1)
d_long1 = np.array(kyoto_corpus2["d_id"], dtype="int")
d_long2 = np.array(kwdlc_corpus2["d_id"], dtype="int")
d_long0 = np.append(d_long1 - np.min(d_long1), d_long2 - np.min(d_long2) + np.max(d_long1 - np.min(d_long1)) + 1)

# phrase idを定義
phrase_id1 = np.array(kyoto_corpus2["phrase_no"], dtype="int")
phrase_id2 = np.array(kwdlc_corpus2["phrase_no"], dtype="int")
phrase_id0 = np.append(phrase_id1 - np.min(phrase_id1), phrase_id2 - np.min(phrase_id2) + np.max(phrase_id1 - np.min(phrase_id1)) + 1)
phrase_master = pd.DataFrame({"id1": np.append(phrase_id1, phrase_id2 + np.max(phrase_id1)), "id2": phrase_id0})
phrase_master = phrase_master.iloc[np.where(phrase_master.duplicated()==False)[0]]
phrase_master.index = np.arange(phrase_master.shape[0])

In [16]:
# データの定義
# 統計量を定義
D0 = len(np.unique(d_id0))
d0 = np.unique(d_id0, return_counts=True)[1].astype("int")
phrase0 = len(np.unique(phrase_id0))
n0 = np.unique(phrase_id0, return_counts=True)[1].astype("int")
N0 = np.sum(n0)

# インデックスを定義
d_list01 = [i for i in range(D0)]
d_list02 = [i for i in range(D0)]
phrase_list0 = [i for i in range(phrase0)]
pt0 = np.repeat(0, D0)
for i in range(D0):
    d_list01[i] = np.where(d_id0==i)[0].astype("int")
    d_list02[i] = np.where(d_long0==i)[0].astype("int")
    pt0[i] = d_list02[i].shape[0]
    
for i in range(phrase0):
    if i==0:
        max_no = 0
        phrase_list0[i] = np.arange(n0[i])
        max_no = np.max(phrase_list0[i]) + 1
    else:
        phrase_list0[i] = max_no + np.arange(n0[i])
        max_no = np.max(phrase_list0[i]) + 1

In [17]:
# 文書のフレーズを定義
# フレーズ間の組み合わせを定義
kyoto_max = np.max(np.array(kyoto_dependency2[["phrase_no1", "phrase_no2"]]))
kyoto_id = np.array(kyoto_dependency2["d_id"], dtype="int")
kwdlc_id = np.array(kwdlc_dependency2["d_id"], dtype="int")
kyoto_phrase_no1 = np.array(kyoto_dependency2["phrase_no1"], dtype="int")
kyoto_phrase_no2 = np.array(kyoto_dependency2["phrase_no2"], dtype="int")
kwdlc_phrase_no1 = np.array(kwdlc_dependency2["phrase_no1"], dtype="int")
kwdlc_phrase_no2 = np.array(kwdlc_dependency2["phrase_no2"], dtype="int")
joint_no1 = pd.DataFrame({"id1": np.append(kyoto_phrase_no1, kwdlc_phrase_no1 + np.max(phrase_id1))})
joint_no2 = pd.DataFrame({"id1": np.append(kyoto_phrase_no2, kwdlc_phrase_no2 + np.max(phrase_id1))})
feature_phrase1 = np.array(pd.merge(joint_no1, phrase_master, on="id1", how="left")["id2"])
feature_phrase2 = np.array(pd.merge(joint_no2, phrase_master, on="id1", how="left")["id2"])

feature_id0 = np.append(kyoto_id - np.min(kyoto_id), kwdlc_id - np.min(kwdlc_id) + np.max(kyoto_id - np.min(kyoto_id)) + 1)
feature_phrase0 = np.hstack((feature_phrase1[:, np.newaxis], feature_phrase2[:, np.newaxis]))
feature_list0 = [np.where(feature_id0==i)[0].astype("int") for i in range(D0)]
F01 = feature_phrase0.shape[0]

# フレーズ間距離を定義
distance0 = np.array(feature_phrase0[:, 1] - feature_phrase0[:, 0] <= C, dtype="int")
distance_index0 = [np.where(distance0==0)[0].astype("int"), np.where(distance0==1)[0].astype("int")]

# フレーズがあるレコードを抽出
phrase_flag0 = []
for i in range(D0):
    flag = np.repeat(0, max_m)
    flag[np.arange(d0[i])] = 1
    phrase_flag0.append(flag)
phrase_flag0 = np.hstack((phrase_flag0))
phrase_index0 = np.where(phrase_flag0==1)[0].astype("int")

## Positional Encodingを定義

In [18]:
# 単語単位のpositional encodingをマッピング
# 単語の位置を定義
mapping_target01 = np.hstack(([np.arange(n0[i]) for i in range(phrase0)])) 

# 位置idをマッピング
pt_list01 = [j for j in range(max_pt1)]
pt_id01 = np.repeat(0, N0)
for j in range(max_pt1-1):
    if (max_pt1-2) > j:
        pt_list01[j] = np.where((mapping_target01 >= allocation1[j]) & (mapping_target01 < allocation1[j+1]))[0].astype("int")
        pt_id01[pt_list01[j]] = np.repeat(j, pt_list01[j].shape[0])
    if (max_pt1-2)==j:
        pt_list01[j] = np.where(mapping_target01 >= allocation1[j])[0].astype("int")
        pt_id01[pt_list01[j]] = np.repeat(j, pt_list01[j].shape[0])
        
# フレーズの末尾の位置を定義
function_flag0 = np.repeat(0, N0)
for i in range(phrase0):
    function_flag0[np.max(phrase_list0[i])] = 1
pt_id01[function_flag0==1] = max_pt1-1

# idとインデックスを定義
pt_list01 = [j for j in range(max_pt1)]
pt_n01 = np.repeat(0, max_pt1)
for j in range(max_pt1):
    pt_list01[j] = np.where(pt_id01==j)[0].astype("int")
    pt_n01[j] = pt_list01[j].shape[0]

In [19]:
# フレーズ単位のpositional encodingをマッピング
# フレーズの位置を定義
mapping_target02 = np.hstack(([np.arange(d0[i]) for i in range(D0)])) 

# 位置idをマッピング
pt_list02 = [j for j in range(max_pt2)]
pt_id02 = np.repeat(0, phrase0)
pt_n02 = np.repeat(0, max_pt2)
for j in range(max_pt2):
    if (max_pt2-1) > j:
        pt_list02[j] = np.where((mapping_target02 >= allocation2[j]) & (mapping_target02 < allocation2[j+1]))[0].astype("int")
        pt_id02[pt_list02[j]] = np.repeat(j, pt_list02[j].shape[0])
        pt_n02[j] = pt_list02[j].shape[0]
    if (max_pt2-1)==j:
        pt_list02[j] = np.where(mapping_target02 >= allocation2[j])[0].astype("int")
        pt_id02[pt_list02[j]] = np.repeat(j, pt_list02[j].shape[0])
        pt_n02[j] = pt_list02[j].shape[0]

## 入力単語を定義

In [20]:
# 単語idおよび活用形idを定義
# 学習データのレコード
index1 = np.where(np.in1d(np.array(kyoto_corpus["d_id"]), split1[1]))[0].astype("int")
index2 = np.where(np.in1d(np.append(np.repeat(-1, kyoto_corpus.shape[0]), kwdlc_corpus["d_id"]), split2[1]))[0].astype("int")
index = np.append(index1, index2)

# idをマッピング
word_id0 = np.array(pd.merge(pd.DataFrame({"word": new_genkei[index]}), word_df, on="word", how="left")["id"])
inflection_id0 = np.array(pd.merge(pd.DataFrame({"inflection": inflection[index]}), inflection_df, on="inflection", how="left")["id"])

In [21]:
# idのベクトルを行列に変換
# 単語レベルのベクトルを行列に変換
word_box0 = np.full((phrase0, max_n+1), v1+1)
inflection_box0 = np.full((phrase0, max_n+1), v2+1)
pt_box0 = np.full((phrase0, max_n+1), max_pt1+1)
for i in range(phrase0):
    word_box0[i, np.arange(n0[i]+1)] = np.append(0, word_id0[phrase_list0[i]]+1)
    inflection_box0[i, np.arange(n0[i]+1)] = np.append(0, inflection_id0[phrase_list0[i]]+1)
    pt_box0[i, np.arange(n0[i]+1)] = np.append(0, pt_id01[phrase_list0[i]]+1)
    
# フレーズレベルのベクトルを行列に変換
phrase_box0 = np.full((D0, max_m), phrase0)
for i in range(D0):
    phrase_box0[i, np.arange(d0[i])] = d_list01[i]

## 応答変数の定義

In [22]:
# パターンごとに係り受け関係を取得
columns = ["dependency", "rel", "rel_type"]
dependency0 = pd.concat((kyoto_dependency2[columns], kwdlc_dependency2[columns]), axis=0)
dependency0.index = np.arange(F01)

Y0 = np.full((F01, classes), 0)
for j in range(classes-1):
    search_word = "^%s$|;%s;|^%s;|;%s$" % (rel_class[j], rel_class[j], rel_class[j], rel_class[j])
    index = np.where(dependency0["rel_type"].str.contains(search_word)==True)[0].astype("int")
    Y0[index, j] = 1
Y0[np.where(dependency0["dependency"]==1)[0], classes-1] = 1

# Argument Structure Analysis by Transformer modelを推定

In [23]:
# Tensor配列を定義
# 応答変数をTensor配列に変換
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Y_ = torch.Tensor(Y).to(device)
Y0_ = torch.Tensor(Y0).to(device)

# 入力変数をTensor配列に変換
word_box_ = torch.LongTensor(word_box)
inflection_box_ = torch.LongTensor(inflection_box)
pt_box_ = torch.LongTensor(pt_box)
pt_id1_ = torch.LongTensor(pt_id1)
pt_id2_ = torch.LongTensor(pt_id2)
phrase_box_ = torch.LongTensor(phrase_box)
phrase_index_ = torch.LongTensor(phrase_index)
feature_phrase_ = torch.LongTensor(feature_phrase)
distance_ = torch.Tensor(distance[:, np.newaxis]).to(device)
unique_phrase = torch.LongTensor(torch.arange(phrase).long())

word_box0_ = torch.LongTensor(word_box0)
inflection_box0_ = torch.LongTensor(inflection_box0)
pt_box0_ = torch.LongTensor(pt_box0)
pt_id01_ = torch.LongTensor(pt_id01)
pt_id02_ = torch.LongTensor(pt_id02)
phrase_box0_ = torch.LongTensor(phrase_box0)
phrase_index0_ = torch.LongTensor(phrase_index0)
feature_phrase0_ = torch.LongTensor(feature_phrase0)
distance0_ = torch.Tensor(distance0[:, np.newaxis]).to(device)
unique_phrase0 = torch.LongTensor(torch.arange(phrase0).long())

In [29]:
# モデルの定義
# 埋め込み層を定義
class Embedding(nn.Module):
    def __init__(self, in_fearture, v1, v2, max_pt1, max_pt2):
        super().__init__()
        self.theta_v = nn.Embedding(num_embeddings=v1+1, embedding_dim=in_features)
        self.theta_f = nn.Embedding(num_embeddings=v2+1, embedding_dim=in_features)
        self.theta_h1 = nn.Embedding(num_embeddings=max_pt1+1, embedding_dim=in_features)
        self.theta_h2 = nn.Embedding(num_embeddings=max_pt2, embedding_dim=in_features)
        
        nn.init.xavier_normal_(self.theta_v.weight)
        nn.init.xavier_normal_(self.theta_f.weight)
        nn.init.xavier_normal_(self.theta_h1.weight)
        nn.init.xavier_normal_(self.theta_h2.weight)
        
    def forward(self, word_box, inflection_box, pt_box, pt_id):
        zeros = torch.full((1, in_features), 0.0).to(device)
        theta_v = torch.cat((self.theta_v(torch.arange(v1+1).to(device)), zeros), dim=0)
        theta_f = torch.cat((self.theta_f(torch.arange(v2+1).to(device)), zeros), dim=0)
        theta_h1 = torch.cat((self.theta_h1(torch.arange(max_pt1+1).to(device)), zeros), dim=0)
        theta_h2 = self.theta_h2(torch.arange(max_pt2).to(device))
        v_features = theta_v[word_box, ]
        f_features = theta_f[inflection_box, ]
        h_features1 = theta_h1[pt_box, ]
        h_features2 = theta_h2[pt_id, ]
        features = v_features + f_features + h_features1
        return features, v_features, f_features, h_features1, h_features2
    
# Self Attention層を定義
class Attention(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.gamma_k = nn.Linear(in_features, in_features, bias=False)
        self.gamma_q = nn.Linear(in_features, in_features, bias=False)
        self.gamma_g = nn.Linear(in_features, in_features, bias=False)
        self.gamma_o = nn.Linear(in_features, in_features, bias=False)

        nn.init.xavier_normal_(self.gamma_k.weight)
        nn.init.xavier_normal_(self.gamma_q.weight)
        nn.init.xavier_normal_(self.gamma_g.weight)
        nn.init.xavier_normal_(self.gamma_o.weight)
        
    def forward(self, features, id_box, k):
        # 全結合層で特徴量を変換
        hidden_k = self.gamma_k(features)
        hidden_q = self.gamma_q(features)
        hidden_g = self.gamma_g(features)

        # Attention Mapを定義
        input_mask = torch.BoolTensor(id_box==k).to(device)
        weights = torch.matmul(hidden_q, hidden_k.transpose(2, 1)) / np.sqrt(in_features)
        mask = input_mask.unsqueeze(1)
        weights = weights.masked_fill(mask==1, -1e9)
        normalized_weights = F.softmax(weights, dim=2)

        # Attention Mapの特徴量を変換
        output = torch.matmul(normalized_weights, hidden_g)
        output = self.gamma_o(output)
        return output, normalized_weights
    
# Transformer Block層を定義
class Transformer(nn.Module):
    def __init__(self, in_features, out_features, dropout_prob):
        super().__init__()
        self.attention_model = Attention(in_features, out_features)
        self.gamma_f1 = nn.Linear(in_features, out_features, bias=False)
        self.gamma_f2 = nn.Linear(out_features, in_features, bias=False)
        self.layernorm1 = nn.LayerNorm(in_features)
        self.layernorm2 = nn.LayerNorm(in_features)
        
        self.dropout1 = nn.Dropout(dropout_prob)
        self.dropout2 = nn.Dropout(dropout_prob)
        nn.init.xavier_normal_(self.gamma_f1.weight)
        nn.init.xavier_normal_(self.gamma_f2.weight)
        
    def forward(self, features, id_box, k):
        # Self Attentionで特徴量を変換
        normalized_features = self.layernorm1(features)
        attention_features, normalized_weights = self.attention_model(features, id_box, k)

        # 正規化とfeed forward層
        dropout_attention = features + self.dropout1(attention_features)
        normalized_attention = self.layernorm2(dropout_attention)
        features_ff1 = self.dropout2(F.relu(self.gamma_f1(normalized_attention)))
        features_ff2 = dropout_attention + self.gamma_f2(features_ff1)
        return features_ff2
    
# 行列分解層を定義
class DMF(nn.Module):
    def __init__(self, in_features, out_features, classes, C):
        super().__init__()
        self.gamma11 = nn.ModuleList([nn.Linear(in_features, out_features) for j in range(C)])
        self.gamma12 = nn.ModuleList([nn.Linear(in_features, out_features) for j in range(C)])
        self.gamma21 = nn.Linear(2*out_features, classes, bias=True)
        self.gamma22 = nn.Linear(out_features, classes, bias=False)
        
        # 重み初期化処理
        for j in range(C):
            nn.init.xavier_normal_(self.gamma11[j].weight)
            nn.init.xavier_normal_(self.gamma12[j].weight)
        nn.init.xavier_normal_(self.gamma21.weight)
        nn.init.xavier_normal_(self.gamma22.weight)
        
    def forward(self, x1, x2, distance):
        ff1 = F.relu(distance*self.gamma11[0](x1) + (1-distance)*self.gamma11[1](x1))
        ff2 = F.relu(distance*self.gamma12[0](x2) + (1-distance)*self.gamma12[1](x2))
        logit = self.gamma21(torch.cat((ff1, ff2), dim=1)) + self.gamma22(ff1 * ff2)
        return logit
    
# 結合層を定義
class Joint(nn.Module):
    def __init__(self, in_features, out_features, out_dim, v1, v2, max_pt1, max_pt2, C, dropout_prob):
        super().__init__()
        self.embedding_model = Embedding(in_features, v1, v2, max_pt1, max_pt2)
        self.transformer_model11 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model12 = Transformer(in_features, out_features, dropout_prob)
        
        self.transformer_model21_1 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model21_2 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model22_1 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model22_2 = Transformer(in_features, out_features, dropout_prob)
        self.dmf_model = DMF(in_features, out_dim, classes, C)
        
    def forward(self, feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id, 
                phrase_index, D, phrase, v1, v2, max_m, zeros):
        features_tensor1, v_features, f_features, h_features1, h_features2 = self.embedding_model(word_box, inflection_box, pt_box, pt_id)
        
        features_transformer11 = self.transformer_model11(features_tensor1, word_box, v1+1)
        features_transformer12 = self.transformer_model12(features_transformer11, word_box, v1+1)
        features_tensor2 = torch.cat((features_transformer12[:, 0, :] + h_features2, zeros), dim=0)[phrase_box, ]
        
        features_transformer21_1 = self.transformer_model21_1(features_tensor2, phrase_box, phrase)
        features_transformer21_2 = self.transformer_model21_2(features_transformer21_1, phrase_box, phrase)
        features_transformer22_1 = self.transformer_model22_1(features_tensor2, phrase_box, phrase)
        features_transformer22_2 = self.transformer_model22_2(features_transformer22_1, phrase_box, phrase)
        
        x1 = features_transformer21_2.reshape(D*max_m, in_features)[phrase_index, ][feature_phrase[:, 0], ]
        x2 = features_transformer22_2.reshape(D*max_m, in_features)[phrase_index, ][feature_phrase[:, 1], ]
        logit = self.dmf_model(x1, x2, distance)
        return logit
    
# 早期終了アルゴリズム
class EarlyStopping:
    '''
    早期終了 (early stopping)
    '''
    def __init__(self, patience=0, verbose=0):
        self._step = 0
        self._loss = float('inf')
        self.patience = patience
        self.verbose = verbose

    def __call__(self, loss):
        if self._loss < loss:
            self._step += 1
            if self._step > self.patience:
                if self.verbose:
                    print('early stopping')
                return True
        else:
            self._step = 0
            self._loss = loss

        return False

In [30]:
# モデルの学習
# ハイパーパラメータを定義
Lambda = torch.tensor(0.0)
in_features = 32
out_features = 128
out_dim = 256
dropout_prob = 0.1
input_mask = torch.BoolTensor(phrase_box==phrase)
input_mask0 = torch.BoolTensor(phrase_box0==phrase0)
zeros1 = torch.Tensor([0.0]).repeat(in_features).reshape(1, in_features)
zeros2 = torch.Tensor([0.0]).repeat(out_features).reshape(1, out_features)
phrase_flag = torch.LongTensor(phrase_box!=phrase)
phrase_flag0 = torch.LongTensor(phrase_box0!=phrase0)

# 対数尤度を定義
def loglike(Y, logit):
    Prob = np.exp(logit) / (1 + np.exp(logit))
    Prob[Prob==1.0] = 0.9999999
    Prob[Prob==0.0] = 0.0000001
    LL = np.sum(Y * np.log(Prob) + (1-Y) * np.log(1-Prob))
    return LL

In [31]:
# アルゴリズムの定義
model = Joint(in_features, out_features, out_dim, v1, v2, max_pt1, max_pt2, C, dropout_prob).to(device)
criterion = nn.BCEWithLogitsLoss(reduction="sum")
optimizer = optimizers.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.99), amsgrad=True, weight_decay=0.25)

def compute_loss(t, y, Lambda):
    Lho = criterion(t, y)
    if Lambda > 0.0:
        l2_reg = torch.tensor(0.)
        for param in model.parameters():
            l2_reg += torch.norm(param)
        Lho += Lambda*l2_reg
    return Lho

def train_step(y, feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id, phrase_index, 
               D, phrase, v1, v2, max_m, zeros, model, optimizer, Lambda):
    model.train()
    optimizer.zero_grad()
    mu = model(feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id, 
               phrase_index, D, phrase, v1, v2, max_m, zeros)
    Lho = compute_loss(mu, y, Lambda)
    Lho.backward()
    optimizer.step()
    return Lho, mu

def val_step(y, feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id,
             phrase_index, D, phrase, v1, v2, max_m, zeros, model, Lambda):
    model.eval()
    mu = model(feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id, 
               phrase_index, D, phrase, v1, v2, max_m, zeros)
    Lho = compute_loss(mu, y, Lambda)
    return Lho, mu


# モデルの設定
epochs = 200
n_batches_train = 100
n_batches_val = 50
batch_size = D // n_batches_train
batch_size0 = D0 // n_batches_val
batch_index = np.array_split(np.arange(D), n_batches_train)
batch_index0 = np.array_split(np.arange(D0), n_batches_val)
mini_batch_size = np.array([len(batch_index[i]) for i in range(n_batches_train)])
mini_batch_size0 = np.array([len(batch_index0[i]) for i in range(n_batches_val)])
es = EarlyStopping(patience=5, verbose=1)
hist = {"train_loglike": [], "val_loglike": []}

In [32]:
# 確率的勾配法でモデルパラメータを推定
for rp in range(epochs):
    
    # 学習データでモデルを学習
    random_index = np.argsort(np.random.uniform(0, 1, D)).astype("int")
    preds_train = []
    y_train = []
    train_loglike = np.repeat(0.0, n_batches_train)

    # ミニバッチごとに学習
    for batch in range(n_batches_train):

        # インデックスを定義
        size = mini_batch_size[batch]
        index = np.sort(random_index[batch_index[batch]])
        index1 = np.hstack(([d_list1[index[i]] for i in range(size)]))
        index2 = np.hstack(([feature_list[index[i]] for i in range(size)]))
        phrase_df = pd.DataFrame({"phrase_id": index1, "id": np.arange(len(index1))})
        phrase_ = len(index1)
        
        # ミニバッチを定義
        Y_ = torch.Tensor(Y[index2, ]).to(device)
        word_box_ = word_box[index1, ]
        inflection_box_ = inflection_box[index1, ]
        pt_box_ = pt_box[index1, ]
        pt_id_ = pt_id2[index1]
        phrase_index_ = torch.where(phrase_flag[index, ].reshape(-1)==True)[0]
        feature_phrase_ = torch.LongTensor(feature_phrase[index2, ])
        distance_ = torch.Tensor(distance[index2][:, np.newaxis]).to(device)

        # idを連番に置き換える
        phrase_box_ = torch.LongTensor([phrase_]).repeat(size*max_m)
        phrase_box_[phrase_index_] = torch.arange(phrase_, dtype=torch.long)
        phrase_box_ = phrase_box_.reshape(size, max_m)
        feature_no1_ = pd.merge(pd.DataFrame({"phrase_id": feature_phrase_[:, 0]}), phrase_df, on="phrase_id", how="left")
        feature_no2_ = pd.merge(pd.DataFrame({"phrase_id": feature_phrase_[:, 1]}), phrase_df, on="phrase_id", how="left")
        feature_no_ = torch.LongTensor(pd.concat((feature_no1_["id"], feature_no2_["id"]), axis=1).to_numpy())

        # ADAMでモデルを学習
        # パラメータを更新
        Lho, mu = train_step(Y_, feature_no_, distance_, word_box_, inflection_box_, pt_box_, phrase_box_, pt_id_, 
                             phrase_index_, size, phrase_, v1, v2, max_m, zeros1.to(device), model, optimizer, Lambda)

        # 学習結果を格納
        preds_train.append(mu.cpu().detach().numpy().astype("float"))
        y_train.append(Y_.cpu().detach().numpy().astype("int") )
        train_loglike[batch] = np.array(-Lho.cpu().detach(), dtype="float")
        
    # 学習データの対数尤度を更新
    preds_train = np.vstack((preds_train))
    y_train = np.vstack((y_train))
    LL = loglike(y_train, preds_train)
    hist["train_loglike"].append(LL)
    torch.cuda.empty_cache()

    
    # 検証データでモデルを評価
    preds_val = []
    y_val = []
    val_loglike = np.repeat(0.0, n_batches_val)
    
    # ミニバッチごとに学習
    for batch in range(n_batches_val):
        
        # インデックスを定義
        size = mini_batch_size0[batch]
        index = batch_index0[batch]
        index1 = np.hstack(([d_list01[index[i]] for i in range(size)]))
        index2 = np.hstack(([feature_list0[index[i]] for i in range(size)]))
        phrase_df0 = pd.DataFrame({"phrase_id": index1, "id": np.arange(len(index1))})
        phrase0_ = len(index1)
        
        # ミニバッチを定義
        Y0_ = torch.Tensor(Y0[index2, ]).to(device)
        word_box0_ = word_box0[index1, ]
        inflection_box0_ = inflection_box0[index1, ]
        pt_box0_ = pt_box0[index1, ]
        pt_id02_ = pt_id02[index1]
        phrase_index0_ = torch.where(phrase_flag0[index, ].reshape(-1)==True)[0]
        feature_phrase0_ = torch.LongTensor(feature_phrase0[index2, ])
        distance0_ = torch.Tensor(distance0[index2][:, np.newaxis]).to(device)

        # idを連番に置き換える
        phrase_box0_ = torch.LongTensor([phrase0_]).repeat(size*max_m)
        phrase_box0_[phrase_index0_] = torch.arange(phrase0_, dtype=torch.long)
        phrase_box0_ = phrase_box0_.reshape(size, max_m)
        feature_no01_ = pd.merge(pd.DataFrame({"phrase_id": feature_phrase0_[:, 0]}), phrase_df0, on="phrase_id", how="left")
        feature_no02_ = pd.merge(pd.DataFrame({"phrase_id": feature_phrase0_[:, 1]}), phrase_df0, on="phrase_id", how="left")
        feature_no0_ = torch.LongTensor(pd.concat((feature_no01_["id"], feature_no02_["id"]), axis=1).to_numpy())
        
        # 推定されたモデルを評価
        Lho, mu = val_step(Y0_, feature_no0_, distance0_, word_box0_, inflection_box0_, pt_box0_, phrase_box0_, pt_id02_,
                           phrase_index0_, size, phrase0_, v1, v2, max_m, zeros1.to(device), model, Lambda)

        # 評価結果を格納
        preds_val.append(mu.cpu().detach().numpy().astype("float"))
        y_val.append(Y0_.cpu().detach().numpy().astype("int") )
        val_loglike[batch] = np.array(-Lho.cpu().detach(), dtype="float")
        
    # 学習データの対数尤度を更新
    preds_val = np.vstack((preds_val))
    y_val = np.vstack((y_val))
    LL0 = loglike(Y0, preds_val)
    hist["val_loglike"].append(LL0)
    torch.cuda.empty_cache()

    
    # 学習結果を表示
    print(rp)
    print(np.round([np.sum(train_loglike), np.sum(val_loglike)], 1))
    print(np.round([LL, LL0], 1))
    
    # 早期終了
    if es(-np.sum(val_loglike))==True:
        break

0
[-687873.9  -43243. ]
[-687873.9  -43243. ]
1
[-372921.5  -35527.9]
[-372921.5  -35527.9]
2
[-322848.   -32746.1]
[-322848.1  -32746.1]
3
[-299988.   -31079.9]
[-299988.   -31079.9]
4
[-285747.9  -30066.9]
[-285747.9  -30066.9]
5
[-272515.7  -28717.3]
[-272515.8  -28717.3]
6
[-261881.4  -27995.3]
[-261881.4  -27995.3]
7
[-253580.7  -27368.4]
[-253580.7  -27368.4]
8
[-246758.3  -26924.4]
[-246758.4  -26924.4]
9
[-241425.2  -26562.9]
[-241425.2  -26562.9]
10
[-237275.   -26273.1]
[-237275.1  -26273.1]
11
[-233278.   -25954.7]
[-233278.   -25954.7]
12
[-230098.9  -25736.5]
[-230099.   -25736.5]
13
[-227030.7  -25738.7]
[-227030.7  -25738.7]
14
[-224653.4  -25582.4]
[-224653.4  -25582.4]
15
[-222020.8  -25472. ]
[-222020.8  -25472. ]
16
[-220045.2  -25269.5]
[-220045.2  -25269.5]
17
[-217979.7  -25275.2]
[-217979.8  -25275.2]
18
[-215424.5  -25141.8]
[-215424.5  -25141.8]
19
[-213711.4  -24858.4]
[-213711.4  -24858.4]
20
[-211914.4  -24868.4]
[-211914.4  -24868.4]
21
[-210827.6  -24665. 

In [None]:
j = 14
Prob = np.exp(preds_val[:, j]) / (1 + np.exp(preds_val[:, j]))
res = Y0[:, j]*Prob + (1-Y0[:, j])*(1-Prob)
np.mean(res)

In [None]:
A = np.sum(Y0[:, j]*Prob) / (np.sum(Y0[:, j]*Prob) + np.sum((1-Y0[:, j])*(Prob)))
B = np.sum((1-Y0)[:, j]*(1-Prob)) / (np.sum((1-Y0)[:, j]*(1-Prob)) + np.sum(Y0[:, j]*(1-Prob)))

In [None]:
(2*B*A) / (A + B)

In [None]:
B

In [None]:
0
[-597315.5  -40302.7]
[-597315.6  -40302.7]
1
[-345526.6  -34632.2]
[-345526.7  -34632.3]
2
[-308278.2  -32397.6]
[-308278.3  -32397.7]
3
[-288602.9  -30349.6]
[-288603.   -30349.7]
4
[-273989.4  -29342.1]
[-273989.6  -29342.1]
5
[-262963.1  -28135.6]
[-262963.3  -28135.7]
6
[-255411.8  -27616.4]
[-255411.9  -27616.5]
7
[-248952.6  -27001.9]
[-248952.8  -27002. ]
8
[-244104.1  -26420.1]
[-244104.3  -26420.2]
9
[-239704.2  -26479.5]
[-239704.4  -26479.6]
10
[-236701.3  -25888.1]
[-236701.5  -25888.2]
11
[-232443.2  -25491.1]
[-232443.4  -25491.2]
12
[-230476.9  -25636.2]
[-230477.1  -25636.2]
13
[-228320.2  -25254.9]
[-228320.4  -25255. ]
14
[-227173.4  -25385.7]
[-227173.6  -25385.8]
15
[-225713.5  -24909.5]
[-225713.7  -24909.6]
16
[-222821.8  -24784.1]
[-222822.   -24784.2]
17
[-221800.7  -24810.6]
[-221800.9  -24810.7]
18
[-220962.7  -24656.9]
[-220962.9  -24657. ]
19
[-219160.8  -24583. ]
[-219161.   -24583.1]
20
[-218092.7  -24405.2]
[-218092.9  -24405.3]
21
[-216878.7  -24347.3]
[-216878.9  -24347.4]
22
[-215723.8  -24213.2]
[-215724.   -24213.3]
23
[-214967.   -24222.1]
[-214967.2  -24222.1]
24
[-214054.5  -24021.9]
[-214054.7  -24022. ]
25
[-212480.3  -23861.9]
[-212480.5  -23862. ]

In [None]:
Prob = np.exp(preds_val) / np.sum(np.exp(preds_val), axis=1)[:, np.newaxis]

Y0 * np.log(Prob)

In [None]:
def loglike(Y, logit):
    Prob = np.exp(logit) / (1 + np.exp(logit))
    Prob[Prob==1.0] = 0.9999999
    Prob[Prob==0.0] = 0.0000001
    LL = np.sum(Y * np.log(Prob) + (1-Y)*np.log(1-Prob))
    return LL

In [None]:
Y0.shape

In [None]:
# 結合層を定義
class Joint(nn.Module):
    def __init__(self, in_features, out_features, out_dim, v1, v2, max_pt1, max_pt2, C, dropout_prob):
        super().__init__()
        self.embedding_model = Embedding(in_features, v1, v2, max_pt1, max_pt2)
        self.transformer_model1_11 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model1_12 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model1_21 = Transformer(in_features, out_features, dropout_prob)
        self.transformer_model1_22 = Transformer(in_features, out_features, dropout_prob)
        
        self.transformer_model21_1 = Transformer(2*in_features, out_features, dropout_prob)
        self.transformer_model21_2 = Transformer(2*in_features, out_features, dropout_prob)
        self.transformer_model22_1 = Transformer(2*in_features, out_features, dropout_prob)
        self.transformer_model22_2 = Transformer(2*in_features, out_features, dropout_prob)
        self.dmf_model = DMF(2*in_features, out_dim, classes, C)
        
    def forward(self, feature_phrase, distance, word_box, inflection_box, pt_box, phrase_box, pt_id, 
                phrase_index, D, phrase, v1, v2, max_m, zeros):
        features_tensor1, v_features, f_features, h_features1, h_features2 = self.embedding_model(word_box, inflection_box, pt_box, pt_id)
        
        features_transformer1_11 = self.transformer_model1_11(features_tensor1, word_box, v1+1)
        features_transformer1_12 = self.transformer_model1_12(features_transformer1_11, word_box, v1+1)
        features_transformer1_21 = self.transformer_model1_21(features_tensor1, word_box, v1+1)
        features_transformer1_22 = self.transformer_model1_22(features_transformer1_21, word_box, v1+1)
        features_transformer1 = torch.cat((features_transformer1_12[:, 0, :], features_transformer1_22[:, 0, :]), dim=1)
        features_tensor2 = torch.cat((features_transformer1 + h_features2, zeros), dim=0)[phrase_box, ]
        
        features_transformer21_1 = self.transformer_model21_1(features_tensor2, phrase_box, phrase)
        features_transformer21_2 = self.transformer_model21_2(features_transformer21_1, phrase_box, phrase)
        features_transformer22_1 = self.transformer_model22_1(features_tensor2, phrase_box, phrase)
        features_transformer22_2 = self.transformer_model22_2(features_transformer22_1, phrase_box, phrase)
        
        x1 = features_transformer21_2.reshape(D*max_m, 2*in_features)[phrase_index, ][feature_phrase[:, 0], ]
        x2 = features_transformer22_2.reshape(D*max_m, 2*in_features)[phrase_index, ][feature_phrase[:, 1], ]
        logit = self.dmf_model(x1, x2, distance)
        return logit