In [1]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import itertools
import time
import re
import os
import glob
import MeCab
from numpy.random import *

# Raise pandas' display limits to 250 rows and 100 columns
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_columns", 100)

# JumanをMeCabにマッピング

## データの前処理

In [2]:
# Load both corpora, order the rows by sentence and token position, renumber the index
path = "D:/Statistics/data/NLP/"
corpus_path = path + "corpus/"

# Only the Kyoto file has its first column dropped (presumably a saved index -- confirm)
kyoto_corpus = (
    pd.read_csv(corpus_path + "kyoto_info.csv")
    .iloc[:, 1:]
    .sort_values(["sentence_id", "serial_no"])
)
kwdlc_corpus = pd.read_csv(corpus_path + "kwdlc_info.csv").sort_values(["sentence_id", "serial_no"])

# After sorting, relabel the rows 0..n-1 so that labels equal row positions
kyoto_corpus.index = np.arange(len(kyoto_corpus))
kwdlc_corpus.index = np.arange(len(kwdlc_corpus))

In [3]:
# Give every sentence a dense 0-based id and record which rows belong to it
# Lookup tables: sorted unique sentence_id -> id
unique_sentence1 = np.unique(kyoto_corpus["sentence_id"])
unique_sentence2 = np.unique(kwdlc_corpus["sentence_id"])
D1 = len(unique_sentence1)
D2 = len(unique_sentence2)
sentence_df1 = pd.DataFrame({"id": np.arange(D1), "sentence_id": unique_sentence1})
sentence_df2 = pd.DataFrame({"id": np.arange(D2), "sentence_id": unique_sentence2})

# A left join on sentence_id attaches the id to every corpus row
kyoto_corpus["d_id"] = kyoto_corpus[["sentence_id"]].merge(sentence_df1, on="sentence_id", how="left")["id"]
kwdlc_corpus["d_id"] = kwdlc_corpus[["sentence_id"]].merge(sentence_df2, on="sentence_id", how="left")["id"]
d_id1 = np.array(kyoto_corpus["d_id"])
d_id2 = np.array(kwdlc_corpus["d_id"])
N1 = len(d_id1)
N2 = len(d_id2)

# Row positions of each sentence, one integer array per sentence id
d_list1 = [np.flatnonzero(d_id1 == doc).astype("int") for doc in range(D1)]
d_list2 = [np.flatnonzero(d_id2 == doc).astype("int") for doc in range(D2)]

## Jumanの結果に対してidを付与する

In [4]:
# Character-level ids for the Juman tokens of the Kyoto corpus
# Every token yields one row per character:
#   column 0 = running token number across the whole corpus
#   column 1 = character offset inside the sentence
#   column 2 = token position inside the sentence
kyoto_word = np.array(kyoto_corpus["word"])
kyoto_word_no1 = []
max_id = 0

# Process one sentence at a time
for i in range(D1):
    index = d_list1[i]
    n = len(index)
    word_no = []
    max_no = -1    # offset of the last character emitted so far in this sentence
    for j in range(n):
        m = len(kyoto_word[index[j]])
        serial_no = np.arange(m) + max_no + 1
        word_no.append(np.column_stack((np.full(m, max_id), serial_no, np.full(m, j))))
        max_no = np.max(word_no[j][:, 1], axis=0)
        max_id += 1

    # Stack the per-token blocks into one (characters x 3) array for the sentence
    kyoto_word_no1.append(np.vstack(word_no))

In [5]:
# Character-level ids for the Juman tokens of the KWDLC corpus
# Every token yields one row per character:
#   column 0 = running token number across the whole corpus
#   column 1 = character offset inside the sentence
#   column 2 = token position inside the sentence
kwdlc_word = np.array(kwdlc_corpus["word"])
kwdlc_word_no1 = []
max_id = 0

# Process one sentence at a time
for i in range(D2):
    index = d_list2[i]
    n = len(index)
    word_no = []
    max_no = -1    # offset of the last character emitted so far in this sentence
    for j in range(n):
        m = len(kwdlc_word[index[j]])
        serial_no = np.arange(m) + max_no + 1
        word_no.append(np.column_stack((np.full(m, max_id), serial_no, np.full(m, j))))
        max_no = np.max(word_no[j][:, 1], axis=0)
        max_id += 1

    # Stack the per-token blocks into one (characters x 3) array for the sentence
    kwdlc_word_no1.append(np.vstack(word_no))

## MeCabの結果に対してidを付与する

In [6]:
# Re-tokenize the Kyoto corpus with MeCab and build character-level ids for its tokens
# Column names for MeCab's output: the surface form followed by nine feature fields
mecab_columns = ["word", "word_class", "class_detail1", "class_detail2", "class_detail3",
                 "inflectional1", "inflectional2", "genkei", "readings1", "readings2"]
parsed_list = [i for i in range(D1)]
kyoto_word_no2 = [i for i in range(D1)]
max_id = 0

# One tagger serves every sentence; building it inside the loop repeated the same setup D1 times
mecab = MeCab.Tagger()

# Run MeCab sentence by sentence
for i in range(D1):
    # Concatenate the Juman tokens back into the sentence text and tokenize it again
    text = kyoto_corpus["word"].iloc[d_list1[i]].str.cat()
    res = mecab.parse(text)
    parsed_split = pd.Series(res.split("\n")).str.split('\t|,').tolist()
    # Drop the last two entries (presumably the "EOS" line and the empty string after it -- confirm)
    parsed_list[i] = pd.DataFrame.from_records(parsed_split[0:len(parsed_split)-2])

    # Pad with "*" when the output has fewer fields than mecab_columns; without the padding
    # the column assignment below raises a length mismatch (the KWDLC cell already pads this way)
    row, col = parsed_list[i].shape
    if col < len(mecab_columns):
        padding = pd.DataFrame(np.full((row, len(mecab_columns)-col), "*"))
        parsed_list[i] = pd.concat((parsed_list[i], padding), axis=1)
    parsed_list[i].columns = mecab_columns
    n = row

    # One row per character: (running token number, character offset in sentence, token position)
    parsed_result = parsed_list[i]
    parsed_word = np.array(parsed_result["word"])
    max_no = -1    # offset of the last character emitted so far in this sentence
    word_no = []
    for j in range(n):
        m = len(parsed_word[j])
        serial_no = np.arange(m)[:, np.newaxis] + max_no + 1
        word_serial = np.repeat(max_id, m)[:, np.newaxis]
        word_id = np.repeat(j, m)[:, np.newaxis]
        word_no.append(np.hstack((word_serial, serial_no, word_id)))
        max_no = np.max(serial_no)
        max_id += 1

    # Store the stacked (characters x 3) array for this sentence
    kyoto_word_no2[i] = np.vstack(word_no)

# Concatenate the per-sentence parses into one frame with a 0..n-1 index
kyoto_parsed = pd.concat((parsed_list), axis=0)
kyoto_parsed.index = np.arange(kyoto_parsed.shape[0])

In [7]:
# Re-tokenize the KWDLC corpus with MeCab and build character-level ids for its tokens
# Column names for MeCab's output: the surface form followed by nine feature fields
mecab_columns = ["word", "word_class", "class_detail1", "class_detail2", "class_detail3",
                 "inflectional1", "inflectional2", "genkei", "readings1", "readings2"]
parsed_list = [i for i in range(D2)]
kwdlc_word_no2 = [i for i in range(D2)]
max_id = 0

# One tagger serves every sentence; building it inside the loop repeated the same setup D2 times
mecab = MeCab.Tagger()

# Run MeCab sentence by sentence
for i in range(D2):
    # Concatenate the Juman tokens back into the sentence text and tokenize it again
    text = kwdlc_corpus["word"].iloc[d_list2[i]].str.cat()
    res = mecab.parse(text)
    parsed_split = pd.Series(res.split("\n")).str.split('\t|,').tolist()
    # Drop the last two entries (presumably the "EOS" line and the empty string after it -- confirm)
    parsed_list[i] = pd.DataFrame.from_records(parsed_split[0:len(parsed_split)-2])

    # Pad with "*" when the output has fewer fields than mecab_columns
    row = parsed_list[i].shape[0]
    col = parsed_list[i].shape[1]
    if col < len(mecab_columns):
        parsed_list[i] = pd.concat((parsed_list[i], pd.DataFrame(np.full((row, len(mecab_columns)-col), "*"))), axis=1)
    parsed_list[i].columns = mecab_columns
    n = parsed_list[i].shape[0]

    # One row per character: (running token number, character offset in sentence, token position)
    parsed_result = parsed_list[i]
    parsed_word = np.array(parsed_result["word"])
    max_no = -1    # offset of the last character emitted so far in this sentence
    word_no = []
    for j in range(n):
        m = len(parsed_word[j])
        serial_no = np.arange(m)[:, np.newaxis] + max_no + 1
        word_serial = np.repeat(max_id, m)[:, np.newaxis]
        word_id = np.repeat(j, m)[:, np.newaxis]
        word_no.append(np.hstack((word_serial, serial_no, word_id)))
        max_no = np.max(serial_no)
        max_id += 1

    # Store the stacked (characters x 3) array for this sentence
    kwdlc_word_no2[i] = np.vstack(word_no)

# Concatenate the per-sentence parses into one frame with a 0..n-1 index
kwdlc_parsed = pd.concat((parsed_list), axis=0)
kwdlc_parsed.index = np.arange(kwdlc_parsed.shape[0])

## MeCabの結果をマッピング

In [8]:
# Align the MeCab tokens of the Kyoto corpus with the Juman annotation
# Per sentence: unique (Juman token, MeCab token) pairs that share a character offset
mapping_list = [i for i in range(D1)]
for i in range(D1):
    columns_name = ["word_serial1", "word_serial2"]
    kyoto_id1 = pd.DataFrame(kyoto_word_no1[i], columns=["word_serial1", "serial_no", "word_id1"])
    kyoto_id2 = pd.DataFrame(kyoto_word_no2[i], columns=["word_serial2", "serial_no", "word_id2"])
    # Left join keeps the MeCab character order, so pairs come out in MeCab token order
    mapping = pd.merge(kyoto_id2, kyoto_id1, on="serial_no", how="left")[columns_name]
    mapping_list[i] = np.array(mapping.iloc[np.where(mapping.duplicated()==False)[0]])
kyoto_mapping = np.vstack((mapping_list))
mapping_df = pd.DataFrame(kyoto_mapping, columns=["word_id1", "word_id2"])

# Combine the Juman annotation columns with the MeCab morphology columns
columns1 = ["doc_id", "d_id", "sentence_id", "word", "phrase_id", "phrase_dependency", "dependency_type1", "tag_id", 
            "tag_dependency", "dependency_type2", "rel", "target", "sid", "tag"]
columns2 = ["word", "genkei", "word_class", "class_detail1", "class_detail2", "class_detail3", 
            "inflectional1", "inflectional2"]
columns = ["doc_id", "d_id", "sentence_id", "word_id2", "word2", "genkei", "word_class",
           "class_detail1", "class_detail2", "class_detail3", "inflectional1", "inflectional2",
           "phrase_id", "phrase_dependency", "dependency_type1", "tag_id", "tag_dependency", 
           "dependency_type2", "rel", "target", "sid", "tag"]
# The mapping values are used as row positions into the Juman corpus and the MeCab parse
temp1 = kyoto_corpus[columns1].iloc[kyoto_mapping[:, 0]]
temp2 = kyoto_parsed[columns2].iloc[kyoto_mapping[:, 1]]
temp1 = temp1.rename(columns={"word": "word1"})
temp2 = temp2.rename(columns={"word": "word2"})
temp1.index = np.arange(temp1.shape[0])
temp2.index = np.arange(temp2.shape[0])
temp = pd.concat((mapping_df, temp1, temp2), axis=1)
# Keep one row per MeCab token: the first pair, i.e. the first Juman token it overlaps
new_kyoto_corpus = temp[columns].iloc[np.where(temp["word_id2"].duplicated()==False)[0]]
new_kyoto_corpus = new_kyoto_corpus.rename(columns={"word_id2": "word_id", "word2": "word"})
# Renumber the rows 0..n-1 (the de-duplication leaves gaps), matching the KWDLC cell
new_kyoto_corpus.index = np.arange(new_kyoto_corpus.shape[0])
del temp, temp1, temp2

In [9]:
# Align the MeCab tokens of the KWDLC corpus with the Juman annotation
# Per sentence: unique (Juman token, MeCab token) pairs that share a character offset
mapping_list = []
for i in range(D2):
    columns_name = ["word_serial1", "word_serial2"]
    kwdlc_id1 = pd.DataFrame(kwdlc_word_no1[i], columns=["word_serial1", "serial_no", "word_id1"])
    kwdlc_id2 = pd.DataFrame(kwdlc_word_no2[i], columns=["word_serial2", "serial_no", "word_id2"])
    mapping = kwdlc_id2.merge(kwdlc_id1, on="serial_no", how="left")[columns_name]
    mapping_list.append(mapping.drop_duplicates().to_numpy())
kwdlc_mapping = np.vstack(mapping_list)
mapping_df = pd.DataFrame(kwdlc_mapping, columns=["word_id1", "word_id2"])

# Combine the Juman annotation columns with the MeCab morphology columns
columns1 = ["doc_id", "d_id", "sentence_id", "word", "phrase_id", "phrase_dependency",
            "dependency_type1", "tag_id", "tag_dependency", "dependency_type2",
            "rel", "target", "sid", "tag"]
columns2 = ["word", "genkei", "word_class", "class_detail1", "class_detail2",
            "class_detail3", "inflectional1", "inflectional2"]
columns = ["doc_id", "d_id", "sentence_id", "word_id2", "word2", "genkei", "word_class",
           "class_detail1", "class_detail2", "class_detail3", "inflectional1", "inflectional2",
           "phrase_id", "phrase_dependency", "dependency_type1", "tag_id", "tag_dependency",
           "dependency_type2", "rel", "target", "sid", "tag"]

# The mapping values are used as row positions into the Juman corpus and the MeCab parse
temp1 = kwdlc_corpus[columns1].iloc[kwdlc_mapping[:, 0]].rename(columns={"word": "word1"})
temp2 = kwdlc_parsed[columns2].iloc[kwdlc_mapping[:, 1]].rename(columns={"word": "word2"})
temp1.index = np.arange(temp1.shape[0])
temp2.index = np.arange(temp2.shape[0])
temp = pd.concat((mapping_df, temp1, temp2), axis=1)

# Keep one row per MeCab token (the first pair found for it), then renumber the rows
new_kwdlc_corpus = temp.loc[~temp["word_id2"].duplicated(), columns]
new_kwdlc_corpus = new_kwdlc_corpus.rename(columns={"word_id2": "word_id", "word2": "word"})
new_kwdlc_corpus.index = np.arange(new_kwdlc_corpus.shape[0])
del temp, temp1, temp2

In [10]:
# Save the re-tokenized corpora: CSV without the row index, Excel with it
new_kyoto_corpus.to_csv(path + "new_kyoto_corpus.csv", index=False)
new_kwdlc_corpus.to_csv(path + "new_kwdlc_corpus.csv", index=False)
new_kyoto_corpus.to_excel(path + "new_kyoto_corpus.xlsx")
new_kwdlc_corpus.to_excel(path + "new_kwdlc_corpus.xlsx")

# フレーズ間の係り受け関係を定義

## Kyoto Corpusの係り受け関係を取得

In [11]:
# Per-sentence row positions and phrase inventories for the re-tokenized Kyoto corpus
# Row positions of every sentence id
d_id = np.array(new_kyoto_corpus["d_id"], dtype="int")
D = np.unique(d_id).shape[0]
d_list = [np.flatnonzero(d_id == doc).astype("int") for doc in range(D)]

# Sorted distinct phrase ids of every sentence, and how many there are (d)
phrase_id = np.array(new_kyoto_corpus["phrase_id"], dtype="int")
unique_phrase = [np.unique(phrase_id[rows]) for rows in d_list]
d = np.array([phrases.shape[0] for phrases in unique_phrase], dtype=int)

In [12]:
# Pull the annotation columns out as arrays and enumerate the candidate phrase pairs
# Annotation columns of the re-tokenized Kyoto corpus
sentence_id = np.array(new_kyoto_corpus["sentence_id"])
phrase_id = np.array(new_kyoto_corpus["phrase_id"], dtype="int")
tag_id = np.array(new_kyoto_corpus["tag_id"], dtype="int")
phrase_dependency = np.array(new_kyoto_corpus["phrase_dependency"], dtype="int")
dependency_type = np.array(new_kyoto_corpus["dependency_type1"])
rel = np.array(new_kyoto_corpus["rel"])
sid = np.array(new_kyoto_corpus["sid"])
tag = np.array(new_kyoto_corpus["tag"])

# Every pair (earlier phrase, later phrase) of a sentence, taken from the strict upper triangle
feature_phrase_list = []
feature_id_list = []
for i in range(D):
    upper_row, upper_col = np.triu_indices(d[i], k=1)
    pairs = np.column_stack((unique_phrase[i][upper_row], unique_phrase[i][upper_col]))
    feature_phrase_list.append(pairs)
    # Sentence id repeated once per pair
    feature_id_list.append(np.full(len(pairs), i))

In [13]:
# For each sentence, label every candidate phrase pair with a dependency flag and its predicate-argument relations
# Per-sentence result slots (the integer placeholders are overwritten inside the loop)
dependency_list = [i for i in range(D)]
rel_flag_list = [i for i in range(D)]
rel_type_list = [i for i in range(D)]

for i in range(D):
    # Dependency flag: a candidate pair "a-b" is 1 when some token of phrase a has phrase_dependency b
    index = d_list[i]
    phrase_id_ = phrase_id[index]
    tag_id_ = tag_id[index]
    feature_phrase_ = feature_phrase_list[i]
    feature_str = feature_phrase_list[i].astype("U").astype("object")
    dependency1 = feature_str[:, 0] + "-" + feature_str[:, 1]
    dependency2 = phrase_id[index].astype("U").astype("object") + "-" + phrase_dependency[index].astype("U").astype("object")
    dependency_list[i] = np.array(np.in1d(dependency1, dependency2), dtype="int")

    # Tokens that carry a relation annotation: tag, sid and rel are all non-missing
    index_tag = np.where((pd.isna(tag[index])==False) & (pd.isna(sid[index])==False) & (pd.isna(rel[index])==False))[0].astype("int")
    m1 = len(index_tag)
    target_sentence = sentence_id[index][0]
    target_sid = sid[index][index_tag]
    target_tag = tag[index][index_tag]
    target_rel = rel[index][index_tag]

    # Relation flag and accumulated relation labels, one entry per candidate pair
    rel_flag_ = np.repeat(0, len(feature_phrase_))
    rel_type_ = np.repeat("", len(feature_phrase_)).astype("object")
    for j1 in range(m1):
        # sid / tag / rel are split on "; " and read as parallel lists (same position = same reference)
        split_sid = str.split(target_sid[j1], "; ")
        split_tag = str.split(target_tag[j1], "; ")
        split_rel = str.split(target_rel[j1], "; ")
        m2 = len(split_sid)

        for j2 in range(m2):
            # Skip references that point to another sentence or have an empty sentence id
            if (split_sid[j2]!=target_sentence) | (split_sid[j2]==""):
                continue
            # Tokens of this sentence whose tag_id equals the referenced tag
            index_send = np.where(tag_id_==int(split_tag[j2]))[0].astype("int")
            if len(index_send)==0:
                continue
            # send = phrase of the referenced tag, receive = phrase of the annotated token
            send_phrase = phrase_id_[index_send][0]
            receive_phrase = phrase_id_[index_tag[j1]]
            
            if send_phrase!=receive_phrase:
                # Only a candidate row equal to (send, receive) is marked; if none exists nothing is written
                index_dependency = np.where((feature_phrase_[:, 0]==send_phrase) & (feature_phrase_[:, 1]==receive_phrase))[0].astype("int")
                rel_flag_[index_dependency] = 1
                rel_type_[index_dependency] += split_rel[j2] + "; "

    # Collapse the accumulated "x; y; " string into sorted distinct labels joined as "x;y"
    for j in range(rel_type_.shape[0]):
        if rel_type_[j]!="":
            rel_type_[j] = pd.Series(np.unique(re.split(" ", rel_type_[j]))).str.cat()
            rel_type_[j] = re.sub(";$", "", rel_type_[j])
            
    # Store this sentence's results
    rel_flag_list[i] = rel_flag_
    rel_type_list[i] = rel_type_
    
# Concatenate the per-sentence results into corpus-wide arrays
feature_id = np.hstack((feature_id_list))
feature_phrase = np.vstack((feature_phrase_list))
dependency = np.hstack((dependency_list))
rel_flag = np.hstack((rel_flag_list))
rel_type = np.hstack((rel_type_list))

In [14]:
# Renumber the phrases so that phrase numbers are unique across the whole Kyoto corpus
# Inputs: per-token phrase columns and the two sides of every candidate pair
phrase = np.unique(np.array(new_kyoto_corpus["d_id"].astype("U") + "-" + new_kyoto_corpus["phrase_id"].astype("U"))).shape[0]
phrase_box1 = new_kyoto_corpus[["d_id", "phrase_id", "phrase_dependency"]]
phrase_box21 = pd.DataFrame(feature_phrase[:, 0], columns=["phrase_id"])
phrase_box22 = pd.DataFrame(feature_phrase[:, 1], columns=["phrase_id"])

# One result slot per sentence. The slots are sized with D, the count the loop below runs over;
# sizing them with D1 left placeholders behind (or overflowed) whenever the two counts differed
phrase_no_list11 = [i for i in range(D)]
phrase_no_list12 = [i for i in range(D)]
phrase_no_list21 = [i for i in range(D)]
phrase_no_list22 = [i for i in range(D)]
max_id = 0

# Assign the new numbers sentence by sentence
for i in range(D):
    index1 = d_list[i]
    index2 = np.where(feature_id==i)[0].astype("int")
    phrase_temp1 = phrase_box1.iloc[index1]
    phrase_temp21 = phrase_box21.iloc[index2]
    phrase_temp22 = phrase_box22.iloc[index2]
    # Kept under its own name so the corpus-wide `phrase_id` array is not overwritten
    # and the preceding dependency cell stays re-runnable
    local_phrase = np.unique(np.array(phrase_temp1["phrase_id"], dtype="int"))
    m = len(local_phrase)
    no = np.arange(m) + max_id

    # Lookup table: phrase id inside the sentence -> corpus-wide phrase number
    target_phrase = pd.DataFrame({"phrase_id": local_phrase, "phrase_no": no})
    phrase_no_list11[i] = np.array(pd.merge(phrase_temp1, target_phrase, on="phrase_id", how="left")["phrase_no"])
    # Heads that match no phrase of the sentence come back as NaN from the left join
    phrase_no_list12[i] = np.array(pd.merge(phrase_temp1, target_phrase, 
                                            left_on="phrase_dependency", right_on="phrase_id", how="left")["phrase_no"])
    phrase_no_list21[i] = np.array(pd.merge(phrase_temp21, target_phrase, on="phrase_id", how="left")["phrase_no"])
    phrase_no_list22[i] = np.array(pd.merge(phrase_temp22, target_phrase, on="phrase_id", how="left")["phrase_no"])
    max_id = np.max(no) + 1
    
# Concatenate the per-sentence results; unmatched heads (NaN) become -1
phrase_no = np.hstack((phrase_no_list11))
dependency_no = np.hstack((phrase_no_list12))
dependency_no[np.isnan(dependency_no)] = -1
dependency_no = np.array(dependency_no, dtype="int")
feature_no = np.hstack((np.hstack((phrase_no_list21))[:, np.newaxis], np.hstack((phrase_no_list22))[:, np.newaxis]))

In [15]:
# Assemble the final Kyoto tables and write them to disk
# Column order of the exported corpus table
kyoto_columns = [
    "serial_no", "doc_id", "d_id", "sentence_id",
    "phrase_id", "phrase_dependency", "phrase_no", "dependency_no",
    "word", "genkei", "word_class", "class_detail1", "class_detail2", "class_detail3",
    "inflectional1", "inflectional2",
    "dependency_type1", "tag_id", "tag_dependency", "dependency_type2",
    "rel", "target", "sid", "tag",
]

# One row per candidate phrase pair
kyoto_dependency_feature = pd.DataFrame({
    "serial_no": np.arange(feature_id.shape[0]),
    "d_id": feature_id,
    "phrase_id1": feature_phrase[:, 0],
    "phrase_id2": feature_phrase[:, 1],
    "phrase_no1": feature_no[:, 0],
    "phrase_no2": feature_no[:, 1],
    "dependency": dependency,
    "rel": rel_flag,
    "rel_type": rel_type,
})

# Add the running row number and the corpus-wide phrase numbers, then fix the column order
new_kyoto_corpus = new_kyoto_corpus.assign(
    serial_no=np.arange(new_kyoto_corpus.shape[0]),
    phrase_no=phrase_no,
    dependency_no=dependency_no,
)[kyoto_columns]

# Write both tables: CSV without the row index, Excel with it
new_kyoto_corpus.to_csv(path + "new_kyoto_corpus.csv", index=False)
new_kyoto_corpus.to_excel(path + "new_kyoto_corpus.xlsx")
kyoto_dependency_feature.to_csv(path + "new_kyoto_dependency_feature.csv", index=False)
kyoto_dependency_feature.to_excel(path + "new_kyoto_dependency_feature.xlsx")

## kwdlc corpusの係り受け関係を取得

In [17]:
# Per-sentence row positions and phrase inventories for the re-tokenized KWDLC corpus
# Row positions of every sentence id
d_id = np.array(new_kwdlc_corpus["d_id"], dtype="int")
D = np.unique(d_id).shape[0]
d_list = [np.flatnonzero(d_id == doc).astype("int") for doc in range(D)]

# Sorted distinct phrase ids of every sentence, and how many there are (d)
phrase_id = np.array(new_kwdlc_corpus["phrase_id"], dtype="int")
unique_phrase = [np.unique(phrase_id[rows]) for rows in d_list]
d = np.array([phrases.shape[0] for phrases in unique_phrase], dtype=int)

In [18]:
# Pull the annotation columns out as arrays and enumerate the candidate phrase pairs
# Annotation columns of the re-tokenized KWDLC corpus
sentence_id = np.array(new_kwdlc_corpus["sentence_id"])
phrase_id = np.array(new_kwdlc_corpus["phrase_id"], dtype="int")
tag_id = np.array(new_kwdlc_corpus["tag_id"], dtype="int")
phrase_dependency = np.array(new_kwdlc_corpus["phrase_dependency"], dtype="int")
dependency_type = np.array(new_kwdlc_corpus["dependency_type1"])
rel = np.array(new_kwdlc_corpus["rel"])
sid = np.array(new_kwdlc_corpus["sid"])
tag = np.array(new_kwdlc_corpus["tag"])

# Every pair (earlier phrase, later phrase) of a sentence, taken from the strict upper triangle
feature_phrase_list = []
feature_id_list = []
for i in range(D):
    upper_row, upper_col = np.triu_indices(d[i], k=1)
    pairs = np.column_stack((unique_phrase[i][upper_row], unique_phrase[i][upper_col]))
    feature_phrase_list.append(pairs)
    # Sentence id repeated once per pair
    feature_id_list.append(np.full(len(pairs), i))

In [19]:
# For each sentence, label every candidate phrase pair with a dependency flag and its predicate-argument relations
# Per-sentence result slots (the integer placeholders are overwritten inside the loop)
dependency_list = [i for i in range(D)]
rel_flag_list = [i for i in range(D)]
rel_type_list = [i for i in range(D)]

for i in range(D):
    # Dependency flag: a candidate pair "a-b" is 1 when some token of phrase a has phrase_dependency b
    index = d_list[i]
    phrase_id_ = phrase_id[index]
    tag_id_ = tag_id[index]
    feature_phrase_ = feature_phrase_list[i]
    feature_str = feature_phrase_list[i].astype("U").astype("object")
    dependency1 = feature_str[:, 0] + "-" + feature_str[:, 1]
    dependency2 = phrase_id[index].astype("U").astype("object") + "-" + phrase_dependency[index].astype("U").astype("object")
    dependency_list[i] = np.array(np.in1d(dependency1, dependency2), dtype="int")

    # Tokens that carry a relation annotation: tag, sid and rel are all non-missing
    index_tag = np.where((pd.isna(tag[index])==False) & (pd.isna(sid[index])==False) & (pd.isna(rel[index])==False))[0].astype("int")
    m1 = len(index_tag)
    target_sentence = sentence_id[index][0]
    target_sid = sid[index][index_tag]
    target_tag = tag[index][index_tag]
    target_rel = rel[index][index_tag]

    # Relation flag and accumulated relation labels, one entry per candidate pair
    rel_flag_ = np.repeat(0, len(feature_phrase_))
    rel_type_ = np.repeat("", len(feature_phrase_)).astype("object")
    for j1 in range(m1):
        # sid / tag / rel are split on "; " and read as parallel lists (same position = same reference)
        split_sid = str.split(target_sid[j1], "; ")
        split_tag = str.split(target_tag[j1], "; ")
        split_rel = str.split(target_rel[j1], "; ")
        m2 = len(split_sid)

        for j2 in range(m2):
            # Skip references that point to another sentence or have an empty sentence id
            if (split_sid[j2]!=target_sentence) | (split_sid[j2]==""):
                continue
            # Tokens of this sentence whose tag_id equals the referenced tag
            index_send = np.where(tag_id_==int(split_tag[j2]))[0].astype("int")
            if len(index_send)==0:
                continue
            # send = phrase of the referenced tag, receive = phrase of the annotated token
            send_phrase = phrase_id_[index_send][0]
            receive_phrase = phrase_id_[index_tag[j1]]
            
            if send_phrase!=receive_phrase:
                # Only a candidate row equal to (send, receive) is marked; if none exists nothing is written
                index_dependency = np.where((feature_phrase_[:, 0]==send_phrase) & (feature_phrase_[:, 1]==receive_phrase))[0].astype("int")
                rel_flag_[index_dependency] = 1
                rel_type_[index_dependency] += split_rel[j2] + "; "

    # Collapse the accumulated "x; y; " string into sorted distinct labels joined as "x;y"
    for j in range(rel_type_.shape[0]):
        if rel_type_[j]!="":
            rel_type_[j] = pd.Series(np.unique(re.split(" ", rel_type_[j]))).str.cat()
            rel_type_[j] = re.sub(";$", "", rel_type_[j])
            
    # Store this sentence's results
    rel_flag_list[i] = rel_flag_
    rel_type_list[i] = rel_type_
    
# Concatenate the per-sentence results into corpus-wide arrays
feature_id = np.hstack((feature_id_list))
feature_phrase = np.vstack((feature_phrase_list))
dependency = np.hstack((dependency_list))
rel_flag = np.hstack((rel_flag_list))
rel_type = np.hstack((rel_type_list))

In [20]:
# Renumber the phrases so that phrase numbers are unique across the whole KWDLC corpus
# Inputs: per-token phrase columns and the two sides of every candidate pair
phrase = np.unique(np.array(new_kwdlc_corpus["d_id"].astype("U") + "-" + new_kwdlc_corpus["phrase_id"].astype("U"))).shape[0]
phrase_box1 = new_kwdlc_corpus[["d_id", "phrase_id", "phrase_dependency"]]
phrase_box21 = pd.DataFrame(feature_phrase[:, 0], columns=["phrase_id"])
phrase_box22 = pd.DataFrame(feature_phrase[:, 1], columns=["phrase_id"])

# One result slot per sentence (the integer placeholders are overwritten inside the loop)
phrase_no_list11 = [i for i in range(D)]
phrase_no_list12 = [i for i in range(D)]
phrase_no_list21 = [i for i in range(D)]
phrase_no_list22 = [i for i in range(D)]
max_id = 0

# Assign the new numbers sentence by sentence
for i in range(D):
    index1 = d_list[i]
    index2 = np.where(feature_id==i)[0].astype("int")
    phrase_temp1 = phrase_box1.iloc[index1]
    phrase_temp21 = phrase_box21.iloc[index2]
    phrase_temp22 = phrase_box22.iloc[index2]
    # Kept under its own name so the corpus-wide `phrase_id` array is not overwritten
    # and the preceding dependency cell stays re-runnable
    local_phrase = np.unique(np.array(phrase_temp1["phrase_id"], dtype="int"))
    m = len(local_phrase)
    no = np.arange(m) + max_id

    # Lookup table: phrase id inside the sentence -> corpus-wide phrase number
    target_phrase = pd.DataFrame({"phrase_id": local_phrase, "phrase_no": no})
    phrase_no_list11[i] = np.array(pd.merge(phrase_temp1, target_phrase, on="phrase_id", how="left")["phrase_no"])
    # Heads that match no phrase of the sentence come back as NaN from the left join
    phrase_no_list12[i] = np.array(pd.merge(phrase_temp1, target_phrase, 
                                            left_on="phrase_dependency", right_on="phrase_id", how="left")["phrase_no"])
    phrase_no_list21[i] = np.array(pd.merge(phrase_temp21, target_phrase, on="phrase_id", how="left")["phrase_no"])
    phrase_no_list22[i] = np.array(pd.merge(phrase_temp22, target_phrase, on="phrase_id", how="left")["phrase_no"])
    max_id = np.max(no) + 1
    
# Concatenate the per-sentence results; unmatched heads (NaN) become -1
phrase_no = np.hstack((phrase_no_list11))
dependency_no = np.hstack((phrase_no_list12))
dependency_no[np.isnan(dependency_no)] = -1
dependency_no = np.array(dependency_no, dtype="int")
feature_no = np.hstack((np.hstack((phrase_no_list21))[:, np.newaxis], np.hstack((phrase_no_list22))[:, np.newaxis]))

In [21]:
# Assemble the final KWDLC tables and write them to disk
# Column order of the exported corpus table
kwdlc_columns = [
    "serial_no", "doc_id", "d_id", "sentence_id",
    "phrase_id", "phrase_dependency", "phrase_no", "dependency_no",
    "word", "genkei", "word_class", "class_detail1", "class_detail2", "class_detail3",
    "inflectional1", "inflectional2",
    "dependency_type1", "tag_id", "tag_dependency", "dependency_type2",
    "rel", "target", "sid", "tag",
]

# One row per candidate phrase pair
kwdlc_dependency_feature = pd.DataFrame({
    "serial_no": np.arange(feature_id.shape[0]),
    "d_id": feature_id,
    "phrase_id1": feature_phrase[:, 0],
    "phrase_id2": feature_phrase[:, 1],
    "phrase_no1": feature_no[:, 0],
    "phrase_no2": feature_no[:, 1],
    "dependency": dependency,
    "rel": rel_flag,
    "rel_type": rel_type,
})

# Add the running row number and the corpus-wide phrase numbers, then fix the column order
new_kwdlc_corpus = new_kwdlc_corpus.assign(
    serial_no=np.arange(new_kwdlc_corpus.shape[0]),
    phrase_no=phrase_no,
    dependency_no=dependency_no,
)[kwdlc_columns]

# Write both tables: CSV without the row index, Excel with it
new_kwdlc_corpus.to_csv(path + "new_kwdlc_corpus.csv", index=False)
new_kwdlc_corpus.to_excel(path + "new_kwdlc_corpus.xlsx")
kwdlc_dependency_feature.to_csv(path + "new_kwdlc_dependency_feature.csv", index=False)
kwdlc_dependency_feature.to_excel(path + "new_kwdlc_dependency_feature.xlsx")

## 係り受け関係を出力

In [25]:
# For every relation type, collect the word strings of the phrase pairs that carry it
# Pool both corpora and list the distinct relation labels
dependency = pd.concat((kyoto_dependency_feature, kwdlc_dependency_feature), axis=0)
dependency.index = np.arange(dependency.shape[0])
temp = np.unique(dependency["rel_type"])
# Pairs without a relation hold ""; filter it by value instead of assuming it is the first sorted entry
temp = temp[temp != ""]
rel_type = np.unique(np.hstack(([str.split(temp[j], ";") for j in range(len(temp))])))

# Inputs for the lookups below
columns = ["phrase_no1", "phrase_no2", "rel_type"]
kyoto_phrase_no = np.array(new_kyoto_corpus["phrase_no"])
kwdlc_phrase_no = np.array(new_kwdlc_corpus["phrase_no"])
kyoto_word = new_kyoto_corpus["word"]
kwdlc_word = new_kwdlc_corpus["word"]

# Result blocks, one per relation type and corpus
kyoto_box_list = []
kwdlc_box_list = []

# Gather the phrase pairs of each relation type
for i in range(len(rel_type)):
    
    # Match the label as a whole ";"-separated item. It is escaped because labels are data
    # and may contain regex metacharacters
    label = re.escape(rel_type[i])
    search_word = "^%s$|;%s;|^%s;|;%s$" % (label, label, label, label)
    kyoto_target = kyoto_dependency_feature[columns].iloc[np.where(kyoto_dependency_feature["rel_type"].str.contains(search_word)==True)[0]]
    kwdlc_target = kwdlc_dependency_feature[columns].iloc[np.where(kwdlc_dependency_feature["rel_type"].str.contains(search_word)==True)[0]]
    n1 = kyoto_target.shape[0]
    n2 = kwdlc_target.shape[0]

    # Kyoto corpus: columns are (words of phrase 1, words of phrase 2, this label, full label list)
    if n1 > 0:
        kyoto_box0 = np.full((n1, 4), "", dtype="object")
        for j in range(n1):
            index1 = np.where(kyoto_phrase_no==kyoto_target.iloc[j, 0])[0].astype("int")
            index2 = np.where(kyoto_phrase_no==kyoto_target.iloc[j, 1])[0].astype("int")
            kyoto_box0[j, 0] = kyoto_word.iloc[index1].str.cat(sep=" ")
            kyoto_box0[j, 1] = kyoto_word.iloc[index2].str.cat(sep=" ")

        kyoto_box0[:, 2] = np.repeat(rel_type[i], n1).astype("object")
        kyoto_box0[:, 3] = np.array(kyoto_target["rel_type"])
        kyoto_box_list.append(kyoto_box0)

    # KWDLC corpus: same layout
    if n2 > 0:
        kwdlc_box = np.full((n2, 4), "", dtype="object")
        for j in range(n2):
            index1 = np.where(kwdlc_phrase_no==kwdlc_target.iloc[j, 0])[0].astype("int")
            index2 = np.where(kwdlc_phrase_no==kwdlc_target.iloc[j, 1])[0].astype("int")
            kwdlc_box[j, 0] = kwdlc_word.iloc[index1].str.cat(sep=" ")
            kwdlc_box[j, 1] = kwdlc_word.iloc[index2].str.cat(sep=" ")

        kwdlc_box[:, 2] = np.repeat(rel_type[i], n2).astype("object")
        kwdlc_box[:, 3] = np.array(kwdlc_target["rel_type"])
        kwdlc_box_list.append(kwdlc_box)

In [26]:
# Turn the collected blocks into frames and write them out
kyoto_rel_dependency = pd.DataFrame(np.vstack(kyoto_box_list), columns=["phrase1", "phrase2", "rel", "rel_type"])
kwdlc_rel_dependency = pd.DataFrame(np.vstack(kwdlc_box_list), columns=["phrase1", "phrase2", "rel", "rel_type"])

# Combined table: both corpora stacked, with a leading column naming the source corpus
F1 = len(kyoto_rel_dependency)
F2 = len(kwdlc_rel_dependency)
rel_dependency = pd.concat((kyoto_rel_dependency, kwdlc_rel_dependency), axis=0)
rel_dependency.insert(0, "corpus", np.concatenate((np.full(F1, "kyoto"), np.full(F2, "kwdlc"))))
rel_dependency.index = np.arange(rel_dependency.shape[0])

# CSV files are written without the row index, Excel files with it
kyoto_rel_dependency.to_csv(path + "kyoto_rel_type_dependency.csv", index=False)
kyoto_rel_dependency.to_excel(path + "kyoto_rel_type_dependency.xlsx")
kwdlc_rel_dependency.to_csv(path + "kwdlc_rel_type_dependency.csv", index=False)
kwdlc_rel_dependency.to_excel(path + "kwdlc_rel_type_dependency.xlsx")
rel_dependency.to_csv(path + "rel_type_dependency.csv", index=False)
rel_dependency.to_excel(path + "rel_type_dependency.xlsx")